library(tm)
library(tidytext)
library(tidyverse)
library(DT)
## Location of the cleaned HappyDB happy-moments CSV.
urldata <- "https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/cleaned_hm.csv"

## Read the CSV into a tibble and preview the first rows.
hm_data <- read_csv(file = urldata)
head(hm_data)

## Build a volatile corpus from the `cleaned_hm` column and preview it.
corpus <- hm_data$cleaned_hm %>%
  VectorSource() %>%
  VCorpus()
head(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 6
## Clean the corpus in place. The transformations run in this order:
##   1. collapse runs of whitespace
##   2. convert to lower case
##   3. strip punctuation
##   4. strip digits
##   5. removeWords() with an empty word list (character(0))
##      NOTE(review): step 5 removes no actual words as written -- confirm
##      whether a real word list was intended here.
corpus <- corpus %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, character(0))
## Stemming reduces a word to its word stem. We stem the words here and then convert the “tm” object to a “tidy” object for much faster processing.
## Stem every document of the cleaned corpus.
stem <- corpus %>% tm_map(stemDocument)
## Convert the stemmed "tm" corpus into a tidy data frame.
tidy_corpus <- tidy(stem)
## Keep only the `text` column; nothing else is needed downstream.
stem_corpus <- tidy_corpus %>% select(text)
## We also need a dictionary to look up the words corresponding to the stems.
## Dictionary of unstemmed words: tidy the cleaned corpus, keep only its
## `text` column, and split the text into one token per row in a column
## named `dictionary`.
dict0 <- tidy(corpus)
text_dict0 <- dict0 %>% select(text)
dict <- text_dict0 %>% unnest_tokens(dictionary, text)

## Load the `stop_words` data set and extend it with extra words that are
## treated as uninformative for this analysis; the additions are tagged with
## lexicon "added".
data("stop_words")
word <- c(
  "happy", "ago", "yesterday", "tomorrow", "lot", "today", "months",
  "month", "happier", "happiest", "last", "week", "day", "past"
)
stop_words <- bind_rows(
  stop_words,
  tibble(word = word, lexicon = "added")
)
## Here we combine the stems and the dictionary into the same “tidy” object.
## Pair every stem with the unstemmed word at the same token position.
##   - `id` numbers the documents (rows of stem_corpus) before tokenising.
##   - bind_cols() attaches `dict` positionally, so it relies on the stemmed
##     and unstemmed texts tokenising to the same number of rows.
##   - Rows whose unstemmed word is listed in `stop_words` are dropped.
completed_corpus1 <- stem_corpus %>%
  mutate(id = seq_len(n())) %>%
  unnest_tokens(stems, text) %>%
  bind_cols(dict) %>%
  anti_join(stop_words, by = c(dictionary = "word"))
## Lastly, we complete the stems by picking the corresponding word with the highest frequency.
## Complete each stem with its most frequent unstemmed word.
##
## Step 1: count every (stem, word) pair and, per stem, keep the word with
##         the highest count (which.max() takes the first one on ties).
## Step 2: attach that word to every token row of completed_corpus1.
##
## completed_corpus1 is deliberately the LEFT table of the join so the result
## keeps its row order (document by document, token by token). With the
## lookup on the left (the former right_join()), recent dplyr versions order
## the output by the lookup table, i.e. by stem, which scrambles the word
## order when the text is pasted back together later. The join key is also
## spelled out instead of relying on a natural join.
completed_corpus2 <- completed_corpus1 %>%
  count(stems, dictionary) %>%
  group_by(stems) %>%
  summarise(word = dictionary[which.max(n)]) %>%
  ungroup() %>%
  left_join(completed_corpus1, ., by = "stems")

## Drop the stem column; `word` stays first to keep the previous column order.
completed_corpus2 <- select(completed_corpus2, word, everything(), -stems)
## We want our processed words to resemble the structure of the original happy moments. So we paste the words together to form happy moments.
## Rebuild one processed text per document: concatenate the completed words
## of each `id`, separated by single spaces.
completed_corpus3 <- completed_corpus2 %>%
  group_by(id) %>%
  summarise(text = str_c(word, collapse = " ")) %>%
  ungroup()

## Attach the processed text to the original data. `id` is the row number,
## matching the numbering used when the corpus was tokenised. The key is
## given explicitly so the join cannot silently pick up other shared column
## names. inner_join() drops documents that have no processed text.
hm_data <- hm_data %>%
  mutate(id = row_number()) %>%
  inner_join(completed_corpus3, by = "id")

## Show the final data as an interactive table.
datatable(hm_data)